# encoding: utf-8

import json

# Parse the JSON export, then take the list stored under "RECORDS".
# A document without that key yields an empty list, i.e. no samples.
with open("./datas/data_source.json", "r", encoding="utf-8") as source_file:
    payload = json.load(source_file)
data_rows = payload.get("RECORDS", [])

# Build one "<sku name>__<category id>" sample line per usable record.
result = []

for data in data_rows:
    sku_name = data.get("skuName")
    cat_id = data.get("catalog3Id")
    if not (sku_name and cat_id):
        # A sample needs both the text and its label.
        continue
    # split() with no argument splits on every whitespace run and drops
    # leading/trailing whitespace, so re-joining removes all whitespace from
    # the name; no further strip() is required.
    sku_name = "".join(sku_name.split())
    # JSON may carry the id as a number rather than a string; str() keeps
    # .strip() from raising AttributeError in that case.
    cat_id = str(cat_id).strip()
    # Re-check after normalisation: a whitespace-only field is truthy above
    # but would otherwise produce a line with an empty name or empty label.
    if sku_name and cat_id:
        result.append(f"{sku_name}__{cat_id}")

# Hold out the last 1000 samples as the test set; everything before them is
# training data. With 1000 or fewer samples the training set ends up empty.
split_at = -1000
trains, tests = result[:split_at], result[split_at:]


# Write each split to its own file, one sample per line (train first, then
# test). Existing files are overwritten.
for out_path, samples in (
    ("./datas/train.txt", trains),
    ("./datas/test.txt", tests),
):
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(f"{sample}\n" for sample in samples)

# Report how many samples ended up in each split.
print(f"train.len={len(trains)}, test.len={len(tests)}")
# Output recorded from an earlier run (presumably on the original data set):
# train.len=59422, test.len=1000
